{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Minimal RNN notebook\n",
    "\n",
    "### RNN representation\n",
    "\n",
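    "Unlike a feed-forward neural network, a recurrent neural network (RNN) contains a cycle: the hidden state computed at one step is fed back into the network at the next step, so information does not flow strictly from input to output. A general representation can be seen as follows:\n",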
    "\n",
    "![Representation](img/rnn_simple.svg)\n",
    "\n",
    "An RNN is useful for dealing with sequential information: a sequence of inputs is fed through the network and the hidden state is updated at each step of the sequence. The sequence is commonly represented as a time sequence, and the most straightforward learning algorithm is [backpropagation through time (BPTT)](http://en.wikipedia.org/wiki/Backpropagation_through_time).\n",
    "\n",
    "To properly understand BPTT, a better representation of the RNN is its unfolded version:\n",
    "\n",
    "![Representation](img/rnn_unfolded.svg)\n",
    "\n",
    "The input $X$ is a sequence $x_0, x_1, \\ldots, x_t$; at each time step $t$ a new input $x_t$ is fed to the network.\n",
    "\n",
    "### Equations\n",
    "\n",
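    "The simplest forward equations for an RNN are as follows:\n",
    "\n",
    "$$h_t = \\tanh(x_t \\cdot W_{in} + h_{t-1} \\cdot W_{rec})$$\n",
    "$$y_t = \\mathrm{softmax}(h_t \\cdot W_{out})$$\n",
    "\n",
    "The implementation below also adds an output bias $b_{out}$ and applies the softmax to the negated pre-activation, i.e. $y_t = \\mathrm{softmax}(-(h_t \\cdot W_{out} + b_{out}))$.\n",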
    "\n",
    "Depending on the problem, either all the outputs $y_0, \\ldots, y_t$ are used, or only the last one, $y_t$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import theano\n",
    "import theano.tensor as T\n",
    "from theano import shared \n",
    "from collections import OrderedDict\n",
    "\n",
    "# Float type shared by every array and shared variable in this notebook,\n",
    "# read from the top-level Theano configuration.\n",
    "dtype = theano.config.floatX\n",
    "# Select Theano's 'fast_compile' optimizer setting.\n",
    "theano.config.optimizer = 'fast_compile'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def init_weight(shape, name, sample='uni'):\n",
    "    if sample=='unishape':\n",
    "        return shared(value=np.asarray(np.random.uniform(\n",
    "                low=-np.sqrt(6. / (shape[0] + shape[1])),\n",
    "                high=np.sqrt(6. / (shape[0] + shape[1])),\n",
    "                size=shape), dtype=dtype), \n",
    "                    name=name, borrow=True)\n",
    "    \n",
    "    if sample=='svd':\n",
    "        values = np.ndarray(shape, dtype=dtype)\n",
    "        for dx in xrange(shape[0]):\n",
    "            vals = np.random.uniform(low=-1., high=1.,  size=(shape[1],))\n",
    "            values[dx,:] = vals\n",
    "        _,svs,_ = np.linalg.svd(values)\n",
    "        #svs[0] is the largest singular value                      \n",
    "        values = values / svs[0]\n",
    "        return shared(values, name=name, borrow=True)\n",
    "    \n",
    "    if sample=='uni':\n",
    "        return shared(value=np.asarray(np.random.uniform(low=-0.1,high=0.1, size=shape), dtype=dtype), \n",
    "                      name=name, borrow=True)\n",
    "    \n",
    "    if sample=='zero':\n",
    "        return shared(value=np.zeros(shape=shape, dtype=dtype), \n",
    "                      name=name, borrow=True)\n",
    "    \n",
    "    \n",
    "    raise \"error bad sample technique\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "class Rnn(object):\n",
    "    \"\"\"Minimal recurrent network trained with plain gradient descent.\n",
    "\n",
    "    For each row x_t of the input sequence the forward pass computes\n",
    "\n",
    "        h_t = tanh(x_t . W_in + h_tm1 . W_rec)\n",
    "        y_t = softmax(-(h_t . W_out + b_out))\n",
    "\n",
    "    starting from an all-zero hidden state.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_in, n_hid, n_out : int\n",
    "        Sizes of the input, hidden and output vectors.\n",
    "    lr : float\n",
    "        Learning rate used for every parameter update.\n",
    "\n",
    "    After construction the instance exposes three compiled Theano functions:\n",
    "\n",
    "    * ``train(X, Y)``     -- returns the cost and applies one gradient step\n",
    "                             to W_in, W_out, W_rec and b_out.\n",
    "    * ``predictions(X)``  -- returns the output y_t for every step of X.\n",
    "    * ``debug(X, Y)``     -- returns the shapes of X, Y, the hidden states\n",
    "                             and the outputs.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, n_in, n_hid, n_out, lr):\n",
    "        self.n_in = n_in\n",
    "        self.n_hid = n_hid\n",
    "        self.n_out = n_out\n",
    "        self.W_in = init_weight((self.n_in, self.n_hid), 'W_in', 'svd')\n",
    "        self.W_out = init_weight((self.n_hid, self.n_out), 'W_out', 'svd')\n",
    "        self.W_rec = init_weight((self.n_hid, self.n_hid), 'W_rec', 'svd')\n",
    "        # The bias is a vector, so its shape is written as a 1-tuple.\n",
    "        self.b_out = init_weight((self.n_out,), 'b_out', 'zero')\n",
    "        self.params = [self.W_in, self.W_out, self.W_rec, self.b_out]\n",
    "\n",
    "        def step(x_t, h_tm1):\n",
    "            \"\"\"One recurrence step: new hidden state and output for input x_t.\"\"\"\n",
    "            h_t = T.tanh(T.dot(x_t, self.W_in) + T.dot(h_tm1, self.W_rec))\n",
    "            pre_softmax = -(T.dot(h_t, self.W_out) + self.b_out)\n",
    "            # Feed softmax an explicit 1-row matrix and take that row back out,\n",
    "            # so y_t is a vector whatever the installed Theano does with vector\n",
    "            # input. NOTE(review): versions that pad a vector to a 1-row matrix\n",
    "            # would otherwise make y_vals 3-D and mis-broadcast it against Y in\n",
    "            # the cost below -- confirm against the Theano version in use.\n",
    "            y_t = T.nnet.softmax(pre_softmax.dimshuffle('x', 0))[0]\n",
    "            return [h_t, y_t]\n",
    "\n",
    "        X = T.matrix()  # X is a sequence of vectors, one row per time step\n",
    "        Y = T.matrix()  # Y is a sequence of vectors, one row per time step\n",
    "        h0 = shared(np.zeros(self.n_hid, dtype=dtype))  # initial hidden state\n",
    "        # np.asarray replaces np.cast, which newer NumPy releases no longer provide.\n",
    "        lr = shared(np.asarray(lr, dtype=dtype))\n",
    "\n",
    "        # scan iterates over the rows of X; only h_t is fed back into step.\n",
    "        [h_vals, y_vals], _ = theano.scan(fn=step,\n",
    "                                          sequences=X,\n",
    "                                          outputs_info=[h0, None])\n",
    "\n",
    "        # h_vals is a sequence of hidden states\n",
    "        # y_vals is a sequence of outputs\n",
    "\n",
    "        # compute cost : cross entropy cost\n",
    "        cost = -T.mean(Y * T.log(y_vals) + (1. - Y) * T.log(1. - y_vals))\n",
    "        # for mean squared error, use\n",
    "        # cost = T.mean((Y - y_vals) ** 2)\n",
    "\n",
    "        # Plain gradient descent: param <- param - lr * d(cost)/d(param)\n",
    "        gparams = T.grad(cost, self.params)\n",
    "        updates = OrderedDict()\n",
    "        for param, gparam in zip(self.params, gparams):\n",
    "            updates[param] = param - gparam * lr\n",
    "\n",
    "        self.train = theano.function(inputs=[X, Y], outputs=cost, updates=updates)\n",
    "        self.predictions = theano.function(inputs=[X], outputs=y_vals)\n",
    "        self.debug = theano.function(inputs=[X, Y],\n",
    "                                     outputs=[X.shape, Y.shape, h_vals.shape, y_vals.shape])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# 7-dimensional inputs and outputs, 50 hidden units, learning rate 0.1\n",
    "model = Rnn(n_in=7, n_hid=50, n_out=7, lr=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# One toy sequence of 100 time steps; vector sizes are taken from the model\n",
    "# so this cell stays consistent if the model dimensions change.\n",
    "seq_len = 100\n",
    "X = np.random.uniform(low=-0.1, high=0.1, size=(seq_len, model.n_in)).astype(dtype=dtype)\n",
    "# The cost is a cross entropy, so targets must lie in [0, 1]: use a random\n",
    "# one-hot row per time step instead of values drawn from [-0.1, 0.1].\n",
    "Y = np.eye(model.n_out, dtype=dtype)[np.random.randint(0, model.n_out, size=seq_len)]\n",
    "\n",
    "# Shapes of X, Y, the hidden states and the outputs\n",
    "print(model.debug(X, Y))\n",
    "# Show only the first few predicted rows rather than the whole array\n",
    "model.predictions(X)[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Run one training step on the toy sequence; the displayed value is the\n",
    "# `cost` output of the compiled train function.\n",
    "model.train(X,Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "nb_epochs = 100\n",
    "\n",
    "# `train_data` is a list of (input sequence, target sequence) pairs. Nothing\n",
    "# earlier in the notebook defines it, so build it from the toy pair (X, Y)\n",
    "# to keep Restart & Run All working; replace it with a real dataset.\n",
    "train_data = [(X, Y)]\n",
    "\n",
    "# stupid and naive sgd: every epoch draws len(train_data) pairs at random,\n",
    "# with replacement, and accumulates the cost returned by each update.\n",
    "for epoch in range(nb_epochs):\n",
    "    error = 0.\n",
    "    for _ in range(len(train_data)):\n",
    "        index = np.random.randint(0, len(train_data))\n",
    "        i, o = train_data[index]\n",
    "        error += model.train(i, o)\n",
    "    if epoch % 10 == 0:\n",
    "        # print(...) with a single argument behaves the same on Python 2 and 3\n",
    "        print('epoch ' + str(epoch) + ' error: ' + str(error))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
